This script uses a combination of the DEP-package and a limma-script that can handle paired samples (LA and RA from same horse). The DEP-package uses FDR-tools to generate q-values, which I have included in my own limma-script.

Required R libraries

if (!require("pacman")) install.packages("pacman")
## Indlæser krævet pakke: pacman
pacman::p_load("edgeR", "readr", "readxl", "biomaRt", "magrittr", "tibble", "stringr", 
               "ggplot2", "data.table", "patchwork", "openxlsx", "dplyr", "missForest", 
               "RColorBrewer", "limma", "DEqMS", "preprocessCore", "DEP", 
               "SummarizedExperiment", "Metrics", "fdrtool", "aamisc", "sva")

# install aamisc package for MDS and Volcano plots
# Commented out for future use:
# pacman::p_load("qvalue", "rain", "limma", "devtools")
# url <- "https://cran.r-project.org/src/contrib/Archive/HarmonicRegression/HarmonicRegression_1.0.tar.gz"
# pkgFile <- "HarmonicRegression_1.0.tar.gz"
# download.file(url = url, destfile = pkgFile)
# install.packages(pkgs=pkgFile, type="source", repos=NULL)
# file.remove(pkgFile)
# pacman::p_load_gh("altintasali/aamisc")

# Colours for publication
publication_colors <- c("Control" = "#285291", "Metformin" = "#7C1516", "Sham" = "#9B9B9B")

Read data and filtrer

# Load count data from a local file and metadata through Excel
# Define file paths
count_file <- "../../../data/count/report.unique_genes_matrix.tsv"
meta_file <- "../../../data/metadata/meta_proteomics.xlsx"
geneinfo_file <- "../../../data/gene_annotation/horse_gene_annotation.tsv.gz"

# Read count data
count <- readr::read_delim(count_file)
## Rows: 2967 Columns: 74
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): Genes
## dbl (73): D:\Mass_spectrometry\Raw_data\Joakim\Simon Horse second run2\1A_RA...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read metadata
meta <- readxl::read_excel(meta_file)

# Remove unnecessary or blank columns from MS machine
count <- count %>% select(-matches("D:\\\\Mass_spectrometry\\\\Raw_data\\\\Joakim\\\\Simon Horse second run2\\\\10A_RA11_1_24830.d"))

# Read and clean gene annotation file
geneinfo <- fread(geneinfo_file)
setnames(geneinfo, old = names(geneinfo), new = c("ENSEMBL", "ENSEMBLv", "Description_detailed", 
                                                  "Chr", "Start", "End", "Strand", "GENENAME", "ENTREZID", "Description"))
geneinfo <- geneinfo %>% select(-ENSEMBLv, -Description_detailed) %>% distinct(ENSEMBL, .keep_all = TRUE)

# Merge gene annotation with count data
annot <- merge(count[, "Genes", drop = FALSE], geneinfo, by.x = "Genes", by.y = "GENENAME", all.x = TRUE)

# Clean count data
count <- count %>% remove_rownames() %>% column_to_rownames(var="Genes")
cleaned_names <- gsub("D:\\\\Mass_spectrometry\\\\Raw_data\\\\Joakim\\\\Simon Horse second run2\\\\|_.*", "", colnames(count))
colnames(count) <- cleaned_names
meta$`Sample ID` <- cleaned_names

#subset data to only RA terminal and baseline samples
meta <- meta[meta$Region %in% c("RA"), ]
count <- count[, colnames(count) %in% meta$`Sample ID`]


# Remove low count samples from Count and Meta
samples_to_remove <- c("25C", "12A", "22A", "19A")
count <- count[, !colnames(count) %in% samples_to_remove]
meta <- meta %>% filter(!`Sample ID` %in% samples_to_remove)

# Calculate the number of proteins in each sample
num_proteins <- colSums(!is.na(count))
num_proteins <- num_proteins[meta$`Sample ID`]  # Ensure matching order with meta

# Define color scheme for groups
KUalt <- c("#7C1516", "#285291", "#434343", "#999999")  # Custom color palette
group_colors <- setNames(KUalt[1:length(unique(meta$Group))], unique(meta$Group))
bar_colors <- group_colors[meta$Group]

# Plot the number of proteins in each sample before filtering
par(mfrow = c(1, 1))
barplot(num_proteins, 
        main = "Number of Proteins in Each Sample\nBefore Filtering",
        xlab = "Sample",
        ylab = "Number of Proteins",
        col = bar_colors,
        border = "black",
        ylim = c(0, 2500),
        las = 2)

# Print the total number of proteins before filtering
cat("The total number of proteins before filtering was", max(num_proteins), "\n")
## The total number of proteins before filtering was 2610
# Identify genes with less than three valid values in any condition
invalid_genes <- unique(unlist(lapply(unique(meta$Condition), function(condition) {
  condition_columns <- meta$`Sample ID`[meta$Condition == condition]
  condition_count <- count[, condition_columns, drop = FALSE]
  rownames(condition_count)[rowSums(!is.na(condition_count)) < 3]
})))

# Remove invalid genes from the original count table
filtered_count <- count[!rownames(count) %in% invalid_genes, ]

# Calculate the number of proteins in each sample after filtering
num_proteins_after <- colSums(!is.na(filtered_count))
num_proteins_after <- num_proteins_after[meta$`Sample ID`]  # Ensure matching order with meta

# Plot settings
par(mfrow = c(1, 2))

# First plot: Number of proteins in each sample before filtering
barplot(num_proteins, 
        main = "Number of Proteins in Each Sample\nBefore Filtering",
        xlab = "Sample",
        ylab = "Number of Proteins",
        col = bar_colors,
        border = "black",
        ylim = c(0, 2500),
        las = 2)

# Second plot: Number of proteins in each sample after filtering
barplot(num_proteins_after, 
        main = "Number of Proteins in Each Sample\nAfter Filtering",
        xlab = "Sample",
        ylab = "Number of Proteins",
        col = bar_colors,
        border = "black",
        ylim = c(0, 2500),
        las = 2)

# Print the total and average number of proteins before and after filtering
cat("The total number of proteins before filtering was", max(num_proteins), "\n")
## The total number of proteins before filtering was 2610
cat("The average number of proteins before filtering was", mean(num_proteins), "\n")
## The average number of proteins before filtering was 2168.578
cat("The total number of proteins after filtering was", max(num_proteins_after), "\n")
## The total number of proteins after filtering was 1288
cat("The average number of proteins after filtering was", mean(num_proteins_after), "\n")
## The average number of proteins after filtering was 1258.933

DEP-package https://bioconductor.org/packages/release/bioc/vignettes/DEP/inst/doc/DEP.html

Generate a SummarizedExperiment object

# Generate proxy columns similar to DEP's sample data to make the structure compatible.
proxy_column_names <- c("Protein.IDs", "Majority.protein.IDs", "Protein.names", "Gene.names", 
                        "Fasta.headers", "Peptides", "Razor...unique.peptides", "Unique.peptides", 
                        "Only.identified.by.site", "Reverse", "Potential.contaminant")
proxy_df <- data.frame(matrix(NA, nrow=nrow(filtered_count), ncol=length(proxy_column_names)))
colnames(proxy_df) <- proxy_column_names

# Assign row names to specific columns
rownames_to_columns <- rownames(filtered_count)
proxy_df[1:4] <- lapply(1:4, function(i) rownames_to_columns)

# Merge and rename sample columns
merged_df <- cbind(proxy_df, filtered_count)
colnames(merged_df)[12:ncol(merged_df)] <- paste0("LFQ.intensity.", colnames(filtered_count))

# Reorder columns
lfq_columns <- grep("^LFQ.intensity", colnames(merged_df), value = TRUE)
new_order <- c(proxy_column_names[1:8], lfq_columns, proxy_column_names[9:11])
data <- merged_df[, new_order]

print(colnames(data))
##  [1] "Protein.IDs"             "Majority.protein.IDs"   
##  [3] "Protein.names"           "Gene.names"             
##  [5] "Fasta.headers"           "Peptides"               
##  [7] "Razor...unique.peptides" "Unique.peptides"        
##  [9] "LFQ.intensity.1A"        "LFQ.intensity.1B"       
## [11] "LFQ.intensity.2A"        "LFQ.intensity.2B"       
## [13] "LFQ.intensity.3A"        "LFQ.intensity.3B"       
## [15] "LFQ.intensity.4A"        "LFQ.intensity.4B"       
## [17] "LFQ.intensity.5A"        "LFQ.intensity.5B"       
## [19] "LFQ.intensity.6A"        "LFQ.intensity.6B"       
## [21] "LFQ.intensity.7A"        "LFQ.intensity.7B"       
## [23] "LFQ.intensity.8A"        "LFQ.intensity.8B"       
## [25] "LFQ.intensity.9A"        "LFQ.intensity.9B"       
## [27] "LFQ.intensity.10A"       "LFQ.intensity.10B"      
## [29] "LFQ.intensity.11A"       "LFQ.intensity.11B"      
## [31] "LFQ.intensity.12B"       "LFQ.intensity.13A"      
## [33] "LFQ.intensity.13B"       "LFQ.intensity.14A"      
## [35] "LFQ.intensity.14B"       "LFQ.intensity.15A"      
## [37] "LFQ.intensity.15B"       "LFQ.intensity.16A"      
## [39] "LFQ.intensity.16B"       "LFQ.intensity.17A"      
## [41] "LFQ.intensity.17B"       "LFQ.intensity.18A"      
## [43] "LFQ.intensity.18B"       "LFQ.intensity.19B"      
## [45] "LFQ.intensity.20A"       "LFQ.intensity.20B"      
## [47] "LFQ.intensity.22B"       "LFQ.intensity.23A"      
## [49] "LFQ.intensity.23B"       "LFQ.intensity.24A"      
## [51] "LFQ.intensity.24B"       "LFQ.intensity.25A"      
## [53] "LFQ.intensity.25B"       "Only.identified.by.site"
## [55] "Reverse"                 "Potential.contaminant"
#Are there any duplicated gene names?
data$Gene.names %>% duplicated() %>% any()
## [1] FALSE
# Make unique names using the annotation in the "Gene.names" column as primary names and the annotation in "Protein.IDs" as name for those that do not have an gene name.
data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")

# Generate a SummarizedExperiment object using an experimental design
LFQ_columns <- grep("LFQ.", colnames(data_unique)) # get LFQ column numbers

# Prepare the metadata for DEP analysis
experimental_design <- data.frame(
  label = colnames(filtered_count),
  condition = meta$Condition,
  replicate = NA  # Initialize replicate column with NA
)

# Assign replicate numbers within each condition
experimental_design <- experimental_design %>%
  group_by(condition) %>%
  mutate(replicate = row_number()) %>%
  ungroup()

# Set row names
row.names(experimental_design) <- experimental_design$label
## Warning: Setting row names on a tibble is deprecated.
# Ensure the row names of experimental_design are unique
experimental_design <- as.data.frame(experimental_design)
rownames(experimental_design) <- make.unique(rownames(experimental_design))

# Print the experimental design to verify
print(experimental_design)
##    label             condition replicate
## 1     1A RA_Metformin_Baseline         1
## 2     1B  RA_Metformin_4months         1
## 3     2A   RA_Control_Baseline         1
## 4     2B    RA_Control_4months         1
## 5     3A RA_Metformin_Baseline         2
## 6     3B  RA_Metformin_4months         2
## 7     4A      RA_Sham_Baseline         1
## 8     4B       RA_Sham_4months         1
## 9     5A RA_Metformin_Baseline         3
## 10    5B  RA_Metformin_4months         3
## 11    6A   RA_Control_Baseline         2
## 12    6B    RA_Control_4months         2
## 13    7A RA_Metformin_Baseline         4
## 14    7B  RA_Metformin_4months         4
## 15    8A   RA_Control_Baseline         3
## 16    8B    RA_Control_4months         3
## 17    9A RA_Metformin_Baseline         5
## 18    9B  RA_Metformin_4months         5
## 19   10A   RA_Control_Baseline         4
## 20   10B    RA_Control_4months         4
## 21   11A RA_Metformin_Baseline         6
## 22   11B  RA_Metformin_4months         6
## 23   12B    RA_Control_4months         5
## 24   13A RA_Metformin_Baseline         7
## 25   13B  RA_Metformin_4months         7
## 26   14A   RA_Control_Baseline         5
## 27   14B    RA_Control_4months         6
## 28   15A   RA_Control_Baseline         6
## 29   15B    RA_Control_4months         7
## 30   16A   RA_Control_Baseline         7
## 31   16B    RA_Control_4months         8
## 32   17A RA_Metformin_Baseline         8
## 33   17B  RA_Metformin_4months         8
## 34   18A      RA_Sham_Baseline         2
## 35   18B       RA_Sham_4months         2
## 36   19B  RA_Metformin_4months         9
## 37   20A   RA_Control_Baseline         8
## 38   20B    RA_Control_4months         9
## 39   22B    RA_Control_4months        10
## 40   23A RA_Metformin_Baseline         9
## 41   23B  RA_Metformin_4months        10
## 42   24A      RA_Sham_Baseline         3
## 43   24B       RA_Sham_4months         3
## 44   25A      RA_Sham_Baseline         4
## 45   25B       RA_Sham_4months         4
# Create the SummarizedExperiment Object
# This will allow DEP to use the data for further analysis and visualization
data_se <- make_se(data_unique, LFQ_columns, experimental_design)

Protein Coverage & filtrer on missing values

# Plot a barplot of the protein identification overlap between samples
plot_frequency(data_se)

plot_numbers(data_se)

plot_coverage(data_se)

# Filter for proteins that are identified in all replicates of at least one condition #Commented out as we have already performed filtrering manually in the first step, which unlike the terminal samples, will be used here. 
#data_filt <- filter_missval(data_se, thr = 0)
#plot_frequency(data_filt)

Normalizaiton

#Check un-normalized data
plot_normalization(data_se)

#VSN-normalization
data_se_norm <- normalize_vsn(data_se)

#Plot VSN-norm data
meanSdPlot(data_se_norm, rank = TRUE)

plot_normalization(data_se_norm)

###Verdict: After variance stabilisation, the median (a reasonable estimator of the standard deviation of feature level data conditional on the mean) is approximately a horizontal line.

Impute data for missing values

#TODO: Data imputation may not be necessary for your case. limma can handle missing values.

# Plot a heatmap of proteins with missing values
plot_missval(data_se_norm)

# Plot intensity distributions and cumulative fraction of proteins with and without missing values
plot_detect(data_se_norm)

#The nature of the missing-ness:  the proteins with missing values have low intensities on average. This data (MNAR and close to the detection limit) should be imputed by a left-censored imputation method, such as the quantile regression-based left-censored function (“QRILC”) or random draws from a left-shifted distribution (“MinProb” and “man”). 

# Impute missing data using random draws from a Gaussian distribution centered around a minimal value (for MNAR)
data_imp_MinProb <- impute(data_se_norm, fun = "MinProb", q = 0.01) 
## Imputing along margin 2 (samples/columns).
## [1] 0.4919121
# Impute missing data using random draws from a manually defined left-shifted Gaussian distribution (for MNAR)
data_imp_man <- impute( data_se_norm, fun = "man", shift = 1.8, scale = 0.3) 


# Impute missing data using Quantile Regression Imputation of Left-Censored Data (It’s particularly useful when missing values are expected to be low or below a detection limit.)
data_imp_qrilc <- impute( data_se_norm, fun = "QRILC") 
## Imputing along margin 2 (samples/columns).
# Plot intensity distributions before and after imputation
plot_imputation( data_se_norm, data_imp_MinProb, data_imp_man, data_imp_qrilc)

###Verdict: data_imp_Minprob seems to be  addressing MNAR the best, and will be used downstream. 

Differential abundance analysis using DEP

# Note: This analysis is intended as an extra quality control (QC) step.
# The main differential abundance analysis is performed using the `limma` package due to the more complex study design.
#Set contrasts
con <- c("RA_Control_4months_vs_RA_Sham_4months", "RA_Metformin_4months_vs_RA_Control_4months")

# Perform differential analysis using the specified contrasts - best on non-imputed data, honestly
data_diff <- test_diff(data_imp_qrilc, type = "manual", test = con)
## Tested contrasts: RA_Control_4months_vs_RA_Sham_4months, RA_Metformin_4months_vs_RA_Control_4months
# Denote significant proteins based on user defined cutoffs
dep <- add_rejections(data_diff, alpha = 0.05, lfc = log2(0))

# Generate a long data.frame
df_long <- get_df_long(dep)
df_wide <- get_df_wide(dep)

Volcano plots and DAE using DEP

# Plot a volcano plot
plot_volcano(dep, contrast = "RA_Control_4months_vs_RA_Sham_4months", label_size = 2, add_names = TRUE)

plot_volcano(dep, contrast = "RA_Metformin_4months_vs_RA_Control_4months", label_size = 2, add_names = TRUE)

# Plot a frequency plot of significant proteins for the different conditions
plot_cond(dep)

# Generate a results table
data_results <- get_results(dep)

# Number of significant proteins
data_results %>% filter(significant) %>% nrow()
## [1] 10

MDS-plots on non-imputed data

# MDS plots on non-imputed data
#Log-transformation
logCounts <- filtered_count

# For PCA plot, we will remove the NAs 
logCounts_noNA  <- na.omit(logCounts)

# Make DGE list and define design
annot_reordered <- annot[match(rownames(logCounts_noNA), annot$Genes), ]
d <- DGEList(counts = logCounts_noNA, genes = annot_reordered, samples = meta)
design <- model.matrix(~0 + Condition, d$samples)

# Remove batch
logCounts_batchRemoved <- removeBatchEffect(logCounts_noNA, 
                                            batch = as.factor(d$samples$Horse), 
                                            design = design)
## Coefficients not estimable: batch22 batch23
## Warning: Partial NA coefficients for 769 probe(s)
meta$Group <- factor(meta$Group, levels = names(publication_colors))

# Plot MDS without batch removal
mds_no_batch <- plotMDS(logCounts_noNA, plot = FALSE)

dims <- list(p1 = c(1,2), p2 = c(1,3), p3 = c(2,3), p4 = c(1,4))
mds_plot_no_batch <- list()

# Without labels (No batch correction)
for (i in seq_along(dims)) {
  mds_plot_no_batch[[i]] <- aamisc::ggMDS(mds = mds_no_batch,
                                          meta = d$samples, 
                                          dim = dims[[i]], 
                                          color.by = "Group", 
                                          shape.by = "Timepoint",
                                          legend.position = "right"
                                          ) + 
                            scale_color_manual(values = publication_colors)
}

plot_no_batch <- patchwork::wrap_plots(mds_plot_no_batch, ncol = 2) + patchwork::plot_layout(guides = 'collect')

# With labels (No batch correction)
for (i in seq_along(dims)) {
  mds_plot_no_batch[[i]] <- aamisc::ggMDS(mds = mds_no_batch,
                                          meta = d$samples, 
                                          dim = dims[[i]], 
                                          color.by = "Group", 
                                          shape.by = "Timepoint",
                                          legend.position = "right",
                                          text.by = "Horse",
                                          text.size = 1.5
                                          ) + 
                            scale_color_manual(values = publication_colors)
}

plot_no_batch_with_labels <- patchwork::wrap_plots(mds_plot_no_batch, ncol = 2) + patchwork::plot_layout(guides = 'collect')

# Compare: Batch removed vs. no batch removed
mds_batch <- plotMDS(logCounts_batchRemoved, plot = FALSE)
mds_plot_batch <- list()

# Without labels (With batch correction)
for (i in seq_along(dims)) {
  mds_plot_batch[[i]] <- aamisc::ggMDS(mds = mds_batch,
                                       meta = d$samples, 
                                       dim = dims[[i]], 
                                       color.by = "Group", 
                                       shape.by = "Timepoint",
                                       legend.position = "right"
                                       ) + 
                         scale_color_manual(values = publication_colors)
}

plot_batch <- patchwork::wrap_plots(mds_plot_batch, ncol = 2) + patchwork::plot_layout(guides = 'collect')

# With labels (With batch correction)
for (i in seq_along(dims)) {
  mds_plot_batch[[i]] <- aamisc::ggMDS(mds = mds_batch,
                                       meta = d$samples, 
                                       dim = dims[[i]], 
                                       color.by = "Group", 
                                       shape.by = "Timepoint",
                                       legend.position = "right",
                                       text.by = "Horse",
                                       text.size = 1.5
                                       ) + 
                         scale_color_manual(values = publication_colors)
}

plot_batch_with_labels <- patchwork::wrap_plots(mds_plot_batch, ncol = 2) + patchwork::plot_layout(guides = 'collect')

# Display the plots
print(plot_no_batch)

print(plot_no_batch_with_labels)

print(plot_batch)

print(plot_batch_with_labels)

Checking histograms and save/load imputations

# Histograms will be generated for four datasets:
# 1. `normalized_unimputed`: Normalized but not imputed data.
# 2. `imputed_data_MinProb`: Data imputed using the "MinProb" method.
# 3. `imputed_data_qrilc`: Data imputed using the "QRILC" method.
# 4. `unfiltered_count`: Raw log-transformed counts after initial filtering.

# Extract the assay data from data_se_norm (normalized_unimputed)
normalized_unimputed <- as.data.frame(assay(data_se_norm))
colnames(normalized_unimputed) <- colnames(filtered_count)
rownames(normalized_unimputed) <- rownames(data_se_norm)

# Extract the assay data from data_imp_MinProb (imputed)
imputed_data_MinProb <- as.data.frame(assay(data_imp_MinProb))
colnames(imputed_data_MinProb) <- colnames(filtered_count)
rownames(imputed_data_MinProb) <- rownames(data_se_norm)

# Extract the assay data from data_imp_qrilc
imputed_data_qrilc <- as.data.frame(assay(data_imp_qrilc))
colnames(imputed_data_qrilc) <- colnames(filtered_count)
rownames(imputed_data_qrilc) <- rownames(data_se_norm)

# Extract the assay data from data_imp_man
imputed_data_man <- as.data.frame(assay(data_imp_man))
colnames(imputed_data_man) <- colnames(filtered_count)
rownames(imputed_data_man) <- rownames(data_se_norm)

# Plot histogram for normalized data
par(mfrow = c(1, 4))
hist(as.vector(as.matrix(normalized_unimputed)), breaks = 50, main = "Normalized Data", xlab = "Intensity")
hist(as.vector(as.matrix(imputed_data_MinProb)), breaks = 50, main = "MinProb Imputed Data", xlab = "Intensity")
hist(as.vector(as.matrix(imputed_data_qrilc)), breaks = 50, main = "QRILC Imputed Data", xlab = "Intensity")
hist(as.vector(as.matrix(filtered_count)), breaks = 50, main = "Unfiltered Data", xlab = "Intensity")

# Define the output folder path
output_folder <- "../../../../Timecourse/analysis/01_dge/output/"

# Function to save a matrix with row names as a separate column
save_matrix <- function(matrix, file_path) {
  # Create a data frame from the matrix, including row names as a column
  matrix_df <- as.data.frame(matrix)
  matrix_df$GeneName <- rownames(matrix)
  fwrite(matrix_df, file = file_path, sep = "\t", row.names = FALSE)
}

# Function to load a matrix and restore row names
load_matrix <- function(file_path) {
  matrix_df <- fread(file_path, data.table = FALSE)
  rownames(matrix_df) <- matrix_df$GeneName
  matrix_df$GeneName <- NULL
  return(as.matrix(matrix_df))  # Convert back to a matrix
}

# Save the matrices
 # save_matrix(normalized_unimputed, paste0(output_folder, "normalized_unimputed.tsv"))
 # save_matrix(imputed_data_MinProb, paste0(output_folder, "imputed_data_MinProb.tsv"))
 # save_matrix(imputed_data_qrilc, paste0(output_folder, "imputed_data_qrilc.tsv"))

# Load the matrices in the future
normalized_unimputed <- load_matrix(paste0(output_folder, "normalized_unimputed.tsv"))
imputed_data_MinProb <- load_matrix(paste0(output_folder, "imputed_data_MinProb.tsv"))
imputed_data_qrilc <- load_matrix(paste0(output_folder, "imputed_data_qrilc.tsv"))

Limma with SVA

# In this workflow, SVA is used to remove hidden variance, and a blocking factor 
# accounts for variance introduced by repeated measures (e.g., two samples from the same horse).

# Extract biological condition information
condition <- d$samples$Condition  # Specify the biological condition

# Apply SVA to Adjust for Hidden Confounders
# Full model including condition
mod <- model.matrix(~ 0 + condition, data = d$samples)

# Null model excluding biological information
mod0 <- model.matrix(~ 1, data = d$samples)

# Estimate the number of surrogate variables
n.sv <- num.sv(imputed_data_MinProb, mod, method = "be")  
cat("Number of surrogate variables estimated:", n.sv, "\n")
## Number of surrogate variables estimated: 8
# Run SVA to estimate surrogate variables
svobj <- sva(imputed_data_MinProb, mod, mod0, n.sv = n.sv)
## Number of significant surrogate variables is:  8 
## Iteration (out of 5 ):1  2  3  4  5
# Visualize Surrogate Variables to Ensure No Biological Information is Captured
# Prepare SV data for plotting
sv_data <- as.data.frame(svobj$sv)
colnames(sv_data) <- paste0("SV", seq_len(ncol(svobj$sv)))
sv_data <- cbind(d$samples, sv_data)

# Generate pairwise scatter plots of surrogate variables
sv_plots <- list()  # Store plots
sv_cols <- colnames(sv_data)[grep("^SV", colnames(sv_data))]

for (i in seq_along(sv_cols)) {
  for (j in seq_along(sv_cols)) {
    if (i < j) {  # Plot each pair only once
      sv_plots[[paste0("SV", i, "_SV", j)]] <- ggplot(sv_data, aes_string(x = sv_cols[i], y = sv_cols[j], 
                                                                          color = "Group", shape = "Timepoint")) +
        geom_point(size = 3, alpha = 0.8) +
        theme_minimal() +
        labs(title = paste("Surrogate Variables:", sv_cols[i], "vs", sv_cols[j]),
             x = paste("Surrogate Variable", i),
             y = paste("Surrogate Variable", j))
    }
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Print all scatter plots for manual inspection
for (plot_name in names(sv_plots)) {
  print(sv_plots[[plot_name]])
}

cat("None of the surrogate variables captured variability related to disease group.\n")
## None of the surrogate variables captured variability related to disease group.
# Incorporate Surrogate Variables into the Design Matrix
modSv <- cbind(mod, svobj$sv)

# Run duplicateCorrelation to estimate correlation between repeated samples (blocking by horse)
corfit <- duplicateCorrelation(imputed_data_MinProb, modSv, block = as.factor(d$samples$Horse))
cat("Consensus correlation for repeated measures:", corfit$consensus.correlation, "\n")
## Consensus correlation for repeated measures: 0.1004124
# Fit the linear model with limma, accounting for repeated measures
fit <- lmFit(imputed_data_MinProb, modSv, block = as.factor(d$samples$Horse), correlation = corfit$consensus.correlation)

# Ensure column names in design matrix are syntactically valid
colnames(modSv) <- make.names(colnames(modSv))

# Define contrasts with the updated column names
con <- makeContrasts(
  Metformin_vs_AF_RA = conditionRA_Metformin_4months - conditionRA_Control_4months,
  AF_vs_Sham_RA = conditionRA_Control_4months - conditionRA_Sham_4months,
  Terminal_vs_Baseline_Control = conditionRA_Control_4months - conditionRA_Control_Baseline,
  Terminal_vs_Baseline_Metformin = conditionRA_Metformin_4months - conditionRA_Metformin_Baseline,
  Diff_Treatment = (conditionRA_Metformin_4months - conditionRA_Metformin_Baseline) - (conditionRA_Control_4months - conditionRA_Control_Baseline),
  Diff_Disease = (conditionRA_Control_4months - conditionRA_Control_Baseline) - (conditionRA_Sham_4months - conditionRA_Sham_Baseline),
  Baseline_Difference_Metf_vs_AF = conditionRA_Metformin_Baseline - conditionRA_Control_Baseline,
  Baseline_Difference_AF_vs_Sham = conditionRA_Control_Baseline - conditionRA_Sham_Baseline,
  levels = modSv)
con
##                                 Contrasts
## Levels                           Metformin_vs_AF_RA AF_vs_Sham_RA
##   conditionRA_Control_4months                    -1             1
##   conditionRA_Control_Baseline                    0             0
##   conditionRA_Metformin_4months                   1             0
##   conditionRA_Metformin_Baseline                  0             0
##   conditionRA_Sham_4months                        0            -1
##   conditionRA_Sham_Baseline                       0             0
##   X                                               0             0
##   X                                               0             0
##   X                                               0             0
##   X                                               0             0
##   X                                               0             0
##   X                                               0             0
##   X                                               0             0
##   X                                               0             0
##                                 Contrasts
## Levels                           Terminal_vs_Baseline_Control
##   conditionRA_Control_4months                               1
##   conditionRA_Control_Baseline                             -1
##   conditionRA_Metformin_4months                             0
##   conditionRA_Metformin_Baseline                            0
##   conditionRA_Sham_4months                                  0
##   conditionRA_Sham_Baseline                                 0
##   X                                                         0
##   X                                                         0
##   X                                                         0
##   X                                                         0
##   X                                                         0
##   X                                                         0
##   X                                                         0
##   X                                                         0
##                                 Contrasts
## Levels                           Terminal_vs_Baseline_Metformin Diff_Treatment
##   conditionRA_Control_4months                                 0             -1
##   conditionRA_Control_Baseline                                0              1
##   conditionRA_Metformin_4months                               1              1
##   conditionRA_Metformin_Baseline                             -1             -1
##   conditionRA_Sham_4months                                    0              0
##   conditionRA_Sham_Baseline                                   0              0
##   X                                                           0              0
##   X                                                           0              0
##   X                                                           0              0
##   X                                                           0              0
##   X                                                           0              0
##   X                                                           0              0
##   X                                                           0              0
##   X                                                           0              0
##                                 Contrasts
## Levels                           Diff_Disease Baseline_Difference_Metf_vs_AF
##   conditionRA_Control_4months               1                              0
##   conditionRA_Control_Baseline             -1                             -1
##   conditionRA_Metformin_4months             0                              0
##   conditionRA_Metformin_Baseline            0                              1
##   conditionRA_Sham_4months                 -1                              0
##   conditionRA_Sham_Baseline                 1                              0
##   X                                         0                              0
##   X                                         0                              0
##   X                                         0                              0
##   X                                         0                              0
##   X                                         0                              0
##   X                                         0                              0
##   X                                         0                              0
##   X                                         0                              0
##                                 Contrasts
## Levels                           Baseline_Difference_AF_vs_Sham
##   conditionRA_Control_4months                                 0
##   conditionRA_Control_Baseline                                1
##   conditionRA_Metformin_4months                               0
##   conditionRA_Metformin_Baseline                              0
##   conditionRA_Sham_4months                                    0
##   conditionRA_Sham_Baseline                                  -1
##   X                                                           0
##   X                                                           0
##   X                                                           0
##   X                                                           0
##   X                                                           0
##   X                                                           0
##   X                                                           0
##   X                                                           0
# Apply contrasts and run eBayes
fit <- contrasts.fit(fit, con)
## Warning in contrasts.fit(fit, con): row names of contrasts don't match col
## names of coefficients
fit <- eBayes(fit, robust = TRUE, trend = TRUE)

# Extract DGE results using the BH method for FDR correction
res <- list()  # List to store DGE results
for (i in colnames(con)) {
  res_tmp <- topTable(fit, coef = i, adjust.method = "BH", number = Inf)  # Get top table results
  res_tmp <- res_tmp[!is.na(res_tmp$t), ]  # Remove rows with NA values
  res_tmp$Contrast <- i #TODO: replaced this part >>> rep(i, nrow(res_tmp))  # Store the contrast name
  res[[i]] <- res_tmp  # Add to the results list
  
  # Print the number of differentially expressed genes based on adjusted p-values
  n_adj_pval <- nrow(res_tmp[res_tmp$adj.P.Val < 0.05, ])
  print(paste('Number of differentially expressed genes for', i, 'based on adjusted p-value (BH) =', n_adj_pval))
}
## [1] "Number of differentially expressed genes for Metformin_vs_AF_RA based on adjusted p-value (BH) = 0"
## [1] "Number of differentially expressed genes for AF_vs_Sham_RA based on adjusted p-value (BH) = 8"
## [1] "Number of differentially expressed genes for Terminal_vs_Baseline_Control based on adjusted p-value (BH) = 359"
## [1] "Number of differentially expressed genes for Terminal_vs_Baseline_Metformin based on adjusted p-value (BH) = 394"
## [1] "Number of differentially expressed genes for Diff_Treatment based on adjusted p-value (BH) = 1"
## [1] "Number of differentially expressed genes for Diff_Disease based on adjusted p-value (BH) = 10"
## [1] "Number of differentially expressed genes for Baseline_Difference_Metf_vs_AF based on adjusted p-value (BH) = 1"
## [1] "Number of differentially expressed genes for Baseline_Difference_AF_vs_Sham based on adjusted p-value (BH) = 42"
# Combine all results into a single data frame
res_all <- do.call(rbind, res)

# Map Gene Names Manually
res_all$GeneName <- sapply(seq_len(nrow(res_all)), function(i) {
  gsub(paste0("^", res_all$Contrast[i], "\\."), "", rownames(res_all)[i])
})

# Split results by contrast for easier output
res_split <- split(res_all, res_all$Contrast)

# Optionally, save the results to files
# openxlsx::write.xlsx(x = res_split, file = "../../../../Timecourse/analysis/01_dge/output/dge_results_sva_combat.xlsx", asTable = TRUE)
# data.table::fwrite(x = res_all, file = "../../../../Timecourse/analysis/01_dge/output/dge_results_sva.tsv.gz", sep = "\t")

# Visualize Results with P-value Histograms
for (contrast in names(res_split)) {
  p_values <- res_split[[contrast]]$P.Value
  ggplot(data = data.frame(P.Value = p_values), aes(x = P.Value)) +
    geom_histogram(breaks = seq(0, 1, by = 0.05), color = "black", fill = "lightblue") +
    theme_minimal() +
    labs(title = paste("P-value Histogram for", contrast),
         x = "P-value",
         y = "Frequency") +
    xlim(0, 1)  # Set x-axis limits
  print(last_plot())
}

Volcano plots

volcano_plots <- list()
for (i in names(res)){
  volcano_plots[[i]] <- ggVolcano(x = res[[i]], 
                                  fdr = 0.05,
                                  fdr.column = "adj.P.Val", 
                                  pvalue.column = "P.Value", 
                                  logFC = 0, 
                                  logFC.column = "logFC", 
                                  text.size = 2) + 
    theme_bw(base_size = 10) + 
    ggtitle(i)
}
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
patchwork::wrap_plots(volcano_plots, ncol = 3)

Better visualization of Volcano plots

library(ggrepel)

# Create a named vector for ENSEMBL to Genes mapping
ensembl_to_Genes <- setNames(annot_reordered$Genes, annot_reordered$ENSEMBL)

# Map ENSEMBL IDs to Gene Names in the `res` list
for (contrast_name in names(res)) {
  # Ensure the dataframe has ENSEMBL IDs as rownames
  if (!"ENSEMBL" %in% colnames(res[[contrast_name]])) {
    res[[contrast_name]]$ENSEMBL <- rownames(res[[contrast_name]])
  }
  
  # Map Gene Names using the annotation
  res[[contrast_name]]$Genes <- ensembl_to_Genes[res[[contrast_name]]$ENSEMBL]
  
  # Replace NA values in Genes with ENSEMBL IDs (to ensure plotting works even if some gene names are missing)
  res[[contrast_name]]$Genes[is.na(res[[contrast_name]]$Genes)] <- res[[contrast_name]]$ENSEMBL[is.na(res[[contrast_name]]$Genes)]
}

# Load the volcano plot helper function
source("volcano_helpers.R")

# Create lists for volcano plots with and without labels
volcano_plots_no_labels <- list()
volcano_plots_with_labels <- list()

# Iterate over each contrast and create volcano plots
for (contrast_name in names(res)) {
  # Ensure the Genes column is present for labeling
  if (!"Genes" %in% colnames(res[[contrast_name]])) {
    res[[contrast_name]]$Genes <- sapply(rownames(res[[contrast_name]]), function(x) gsub(".*_", "", x))
  }
  
  # Generate volcano plots using the helper function
  volcano_plots <- create_custom_volcano_plot(
    df = res[[contrast_name]],
    logFC_col = "logFC",
    pvalue_col = "P.Value",
    adj_pvalue_col = "adj.P.Val",
    contrast_name = contrast_name,
    fc_cutoff = 0,  # Set fold-change cutoff for significance
    pvalue_cutoff = 0.05,  # Set p-value cutoff
    save_plot = TRUE, 
    output_path = "../output/",  # Adjust output path if needed
    show_labels = TRUE  # Generate both labeled and unlabeled plots
  )
  
  # Store the plots
  volcano_plots_no_labels[[contrast_name]] <- volcano_plots$No_Labels
  volcano_plots_with_labels[[contrast_name]] <- volcano_plots$With_Labels
}
## Warning: ggrepel: 297 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 323 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Combine and display volcano plots without labels
combined_volcano_no_labels <- patchwork::wrap_plots(volcano_plots_no_labels, ncol = 3)

# Combine and display volcano plots with labels
combined_volcano_with_labels <- patchwork::wrap_plots(volcano_plots_with_labels, ncol = 3)

# Display the combined volcano plots
print(combined_volcano_no_labels)

print(combined_volcano_with_labels)
## Warning: ggrepel: 354 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 390 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Print individual volcano plots with labels for key contrasts
print(volcano_plots_with_labels[["Diff_Treatment"]])

print(volcano_plots_with_labels[["Diff_Disease"]])

# ggsave("../output/volcano_Diff_Treatment_for_figure.png", (volcano_plots_with_labels[["Diff_Treatment"]]), 
#        dpi = 600, width = 4, height = 3, units = "in")

Plotting One

Understanding Treatment Direction and Plotting Protein Abundance

# Understanding LogFC for "Diff_Treatment"
# Positive LogFC indicates:
# 1) Metformin group shows a relatively higher level compared to the control group.
# 2) This can occur if:
#    a) Metformin increases and control decreases.
#    b) Both increase, but metformin shows a greater increase.
#    c) Both decrease, but metformin shows a smaller decrease.

# Negative LogFC indicates:
# 1) Metformin group shows a relatively lower level compared to the control group.
# 2) This can occur if:
#    a) Metformin decreases and control increases.
#    b) Both decrease, but metformin shows a greater decrease.
#    c) Both increase, but metformin shows a smaller increase.

# Load ggpubr for enhanced visualization
library(ggpubr)

# Step 1: Define the function to create a violin plot with individual points and mean line
plot_protein_counts <- function(protein_of_interest) {
  # Check if the protein is present in the dataset
  if (protein_of_interest %in% rownames(data_se_norm)) {
    # Extract the normalized counts for the protein
    protein_counts <- assay(data_se_norm)[protein_of_interest,]
    
    # Create a data frame for plotting
    protein_df <- data.frame(
      SampleID = colnames(data_se_norm), # Extract sample IDs
      Count = protein_counts,            # Protein counts
      Condition = meta$Condition         # Experimental conditions
    )
    
    # Generate violin plot with jitter points and mean line
    p <- ggplot(protein_df, aes(x = Condition, y = Count, fill = Condition)) +
      geom_violin(trim = FALSE, alpha = 0.5) +                        # Violin plot
      geom_jitter(width = 0.2, size = 2, alpha = 0.7) +               # Add individual points
      stat_summary(fun = mean, geom = "crossbar", width = 0.5, fatten = 2, color = "red") +  # Mean line
      labs(
        title = paste("Violin plot of normalized counts for", protein_of_interest),  # Plot title
        x = "Condition", y = "Normalized Abundance"                                  # Axis labels
      ) +
      theme_pubr() +                                                  # Apply a clean theme
      scale_fill_brewer(palette = "Set3")                             # Color palette

    print(p)  # Display the plot
  } else {
    # Output message if protein is not found
    cat("The protein", protein_of_interest, "is not present in the dataset.\n")
  }
}

# Step 2: Visualize selected proteins with significant treatment effects to understand, why it is upregulated and downregulated (see comment in beginning of chunk)

# A) Electron Transport Chain Genes - negatively regulated (blue) in "Diff_Treatment"
plot_protein_counts("NDUFA6")   # Associated with metformin in studies related to Aortic Aneurysms, increases less in metformin group

plot_protein_counts("ATP5F1C")  # Important candidate gene for treatment effects, small decrease in metformin group

plot_protein_counts("KARS1")    # Downregulated in response to treatment 

# B) Proteasome Proteins - negatively regulated (blue) in "Diff_Treatment"
plot_protein_counts("PSMC5")    # Protein linked to proteasome function

plot_protein_counts("PSMC4")    # Another candidate for proteasome-associated mechanisms

plot_protein_counts("PSMD11")   # Related to proteasome degradation processes

# C) Heat Shock Proteins (Hsp90 Family) - negatively regulated (blue) in "Diff_Treatment"
plot_protein_counts("SUGT1")    # Chaperone activity protein
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_summary()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

plot_protein_counts("HSP90AA1") # Hsp90 protein linked to stress response

plot_protein_counts("DYNC1H1")  # Motor protein involved in cellular transport

# D) Detoxification of Reactive Oxygen Species (ROS) - positively regulated (red) in "Diff_Treatment"
plot_protein_counts("TXNRD2")   # Key enzyme in ROS detoxification

plot_protein_counts("TXN")      # Thioredoxin, involved in redox regulation

plot_protein_counts("SOD3")     # Superoxide dismutase, a primary ROS scavenger

# Additional Proteins of Interest
plot_protein_counts("DDAH1")    # Dimethylarginine Dimethylaminohydrolase 1

plot_protein_counts("COQ8A")    # Coenzyme Q8 homolog

plot_protein_counts("RICTOR")   # Component of mTOR complex
## The protein RICTOR is not present in the dataset.
plot_protein_counts("YWHAE")    # 14-3-3 protein epsilon, signaling protein

plot_protein_counts("TXNDC5")   # Protein disulfide isomerase, ROS response

plot_protein_counts("GNAI2")

Focus on GNAI2

# Define the function to create a violin plot with individual points and mean line
plot_protein_counts <- function(protein_of_interest) {
  # Check if the protein is present in the dataset
  if (protein_of_interest %in% rownames(data_se_norm)) {
    # Extract the normalized counts for the protein
    protein_counts <- assay(data_se_norm)[protein_of_interest,]
    
    # Create a data frame for plotting
    protein_df <- data.frame(
      SampleID = colnames(data_se_norm), # Extract sample IDs
      Count = protein_counts,            # Protein counts
      Condition = meta$Condition         # Experimental conditions
    )
    
    # Generate violin plot with jitter points and mean line
    p <- ggplot(protein_df, aes(x = Condition, y = Count, fill = Condition)) +
      geom_violin(trim = FALSE, alpha = 0.7, color = "black") +       # Violin plot with border
      geom_jitter(width = 0.15, size = 1.5, alpha = 0.6, color = "black") +  # Individual points
      stat_summary(fun = mean, geom = "crossbar", width = 0.5, fatten = 2, color = "darkred") +  # Mean line
      labs(
        title = paste("Normalized Protein Counts for", protein_of_interest),  # Plot title
        x = "Condition", y = "Normalized Abundance"                           # Axis labels
      ) +
      theme_pubr() +                                                          # Apply a clean, publication-ready theme
      scale_fill_brewer(palette = "Set3") +                                   # Use a color palette with enough colors
      theme(
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),     # Centered and bold title
        axis.title = element_text(size = 12, face = "bold"),                  # Bold axis titles
        axis.text = element_text(size = 10, color = "black"),                 # Custom axis text
        legend.position = "none"                                              # Remove legend
      )
    
    # Print the plot
    print(p)
  } else {
    # Output message if the protein is not found
    cat("The protein", protein_of_interest, "is not present in the dataset.\n")
  }
}

# Plot the protein counts for "GNAI2"
plot_protein_counts("GNAI2")

MDS plots and SVA

# Transform raw count data and remove NA values
logCounts_noNA <- na.omit(logCounts)

# Create DGEList object and define the design matrix
annot_reordered <- annot[match(rownames(logCounts_noNA), annot$Genes), ]
d <- DGEList(counts = logCounts_noNA, genes = annot_reordered, samples = meta)
design <- model.matrix(~0 + Condition, data = d$samples)

# 1. MDS Plot for Raw Data
mds_raw <- plotMDS(logCounts_noNA, plot = FALSE)
mds_plot_raw <- aamisc::ggMDS(mds = mds_raw, meta = d$samples, color.by = "Group", shape.by = "Timepoint", legend.position = "right") +
  scale_color_manual(values = publication_colors) +
  labs(title = "MDS: Raw Data", x = "MDS Dimension 1", y = "MDS Dimension 2") +
  theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))

# 2. MDS Plot for Data Adjusted by Blocking for Horse
logCounts_blocked <- removeBatchEffect(logCounts_noNA, batch = as.factor(d$samples$Horse), design = design)
## Coefficients not estimable: batch22 batch23
## Warning: Partial NA coefficients for 769 probe(s)
mds_blocked <- plotMDS(logCounts_blocked, plot = FALSE)
mds_plot_blocked <- aamisc::ggMDS(mds = mds_blocked, meta = d$samples, color.by = "Group", shape.by = "Timepoint", legend.position = "right") +
  scale_color_manual(values = publication_colors) +
  labs(title = "MDS: Blocked for Horse", x = "MDS Dimension 1", y = "MDS Dimension 2") +
  theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))

# 3. MDS Plot for Data Adjusted Using ComBat
combat_data <- ComBat(dat = logCounts_noNA, batch = as.factor(d$samples$Batch), mod = model.matrix(~Condition, data = d$samples))
## Found4batches
## Adjusting for5covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
mds_combat <- plotMDS(combat_data, plot = FALSE)
mds_plot_combat <- aamisc::ggMDS(mds = mds_combat, meta = d$samples, color.by = "Group", shape.by = "Timepoint", legend.position = "right") +
  scale_color_manual(values = publication_colors) +
  labs(title = "MDS: ComBat Adjusted", x = "MDS Dimension 1", y = "MDS Dimension 2") +
  theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))

# 4. MDS Plot for Data Adjusted Using SVA
mod <- model.matrix(~0 + Condition, data = d$samples)
mod0 <- model.matrix(~1, data = d$samples)
n.sv <- num.sv(logCounts_noNA, mod, method = "be")
logCounts_noNA <- as.matrix(logCounts_noNA)
storage.mode(logCounts_noNA) <- "numeric"
svobj <- sva(logCounts_noNA, mod, mod0, n.sv = n.sv)
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5
modSv <- cbind(mod, svobj$sv)
logCounts_sva <- removeBatchEffect(logCounts_noNA, batch = d$samples$Horse, design = modSv)
## Coefficients not estimable: batch22 batch23
## Warning: Partial NA coefficients for 769 probe(s)
mds_sva <- plotMDS(logCounts_sva, plot = FALSE)
mds_plot_sva <- aamisc::ggMDS(mds = mds_sva, meta = d$samples, color.by = "Group", shape.by = "Timepoint", legend.position = "right") +
  scale_color_manual(values = publication_colors) +
  labs(title = "MDS: SVA Adjusted", x = "MDS Dimension 1", y = "MDS Dimension 2") +
  theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))

# 5. MDS Plot for Data Adjusted Using SVA (with Labels)
mds_plot_sva_labelled <- aamisc::ggMDS(mds = mds_sva, meta = d$samples, color.by = "Group", shape.by = "Timepoint", legend.position = "right", text.by = "Horse", text.size = 1.5) +
  scale_color_manual(values = publication_colors) +
  labs(title = "MDS: SVA Adjusted (Labelled)", x = "MDS Dimension 1", y = "MDS Dimension 2") +
  theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))

# Display the plots
print(mds_plot_raw)

print(mds_plot_combat)

print(mds_plot_blocked)

print(mds_plot_sva)

print(mds_plot_sva_labelled)

Session Info

sessionInfo()
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=Danish_Denmark.utf8  LC_CTYPE=Danish_Denmark.utf8   
## [3] LC_MONETARY=Danish_Denmark.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=Danish_Denmark.utf8    
## 
## time zone: Europe/Copenhagen
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggpubr_0.6.0                ggrepel_0.9.5              
##  [3] sva_3.50.0                  BiocParallel_1.36.0        
##  [5] genefilter_1.84.0           mgcv_1.9-1                 
##  [7] nlme_3.1-163                aamisc_0.1.5               
##  [9] fdrtool_1.2.17              Metrics_0.1.4              
## [11] SummarizedExperiment_1.32.0 Biobase_2.62.0             
## [13] GenomicRanges_1.54.1        GenomeInfoDb_1.38.8        
## [15] IRanges_2.36.0              S4Vectors_0.40.2           
## [17] BiocGenerics_0.48.1         MatrixGenerics_1.14.0      
## [19] DEP_1.24.0                  preprocessCore_1.64.0      
## [21] DEqMS_1.20.0                matrixStats_1.2.0          
## [23] RColorBrewer_1.1-3          missForest_1.5             
## [25] dplyr_1.1.4                 openxlsx_4.2.5.2           
## [27] patchwork_1.2.0             data.table_1.14.10         
## [29] ggplot2_3.5.0               stringr_1.5.1              
## [31] tibble_3.2.1                magrittr_2.0.3             
## [33] biomaRt_2.58.2              readxl_1.4.3               
## [35] readr_2.1.5                 edgeR_4.0.16               
## [37] limma_3.58.1                pacman_0.5.1               
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.2           later_1.3.2             norm_1.0-11.1          
##   [4] bitops_1.0-7            filelock_1.0.3          R.oo_1.26.0            
##   [7] cellranger_1.1.0        XML_3.99-0.16.1         lifecycle_1.0.4        
##  [10] rstatix_0.7.2           doParallel_1.0.17       vroom_1.6.5            
##  [13] lattice_0.21-9          MASS_7.3-60             backports_1.5.0        
##  [16] sass_0.4.9              rmarkdown_2.27          jquerylib_0.1.4        
##  [19] yaml_2.3.8              httpuv_1.6.13           doRNG_1.8.6            
##  [22] zip_2.3.1               MsCoreUtils_1.14.1      DBI_1.2.3              
##  [25] abind_1.4-5             zlibbioc_1.48.0         R.utils_2.12.3         
##  [28] purrr_1.0.2             HarmonicRegression_1.0  itertools_0.1-3        
##  [31] RCurl_1.98-1.14         rappdirs_0.3.3          sandwich_3.1-0         
##  [34] circlize_0.4.16         GenomeInfoDbData_1.2.11 annotate_1.80.0        
##  [37] MSnbase_2.28.1          ncdf4_1.22              codetools_0.2-20       
##  [40] DelayedArray_0.28.0     DT_0.33                 xml2_1.3.6             
##  [43] tidyselect_1.2.1        gmm_1.8                 shape_1.4.6.1          
##  [46] farver_2.1.1            gmp_0.7-4               BiocFileCache_2.10.2   
##  [49] jsonlite_1.8.8          multtest_2.58.0         GetoptLong_1.0.5       
##  [52] survival_3.5-8          iterators_1.0.14        systemfonts_1.1.0      
##  [55] foreach_1.5.2           tools_4.3.2             progress_1.2.3         
##  [58] ragg_1.3.2              Rcpp_1.0.12             glue_1.7.0             
##  [61] gridExtra_2.3           SparseArray_1.2.3       xfun_0.45              
##  [64] qvalue_2.34.0           shinydashboard_0.7.2    withr_3.0.0            
##  [67] BiocManager_1.30.23     fastmap_1.1.1           fansi_1.0.6            
##  [70] digest_0.6.34           R6_2.5.1                mime_0.12              
##  [73] textshaping_0.4.0       imputeLCMD_2.1          colorspace_2.1-0       
##  [76] Cairo_1.6-2             RSQLite_2.3.5           R.methodsS3_1.8.2      
##  [79] hexbin_1.28.3           utf8_1.2.4              tidyr_1.3.0            
##  [82] generics_0.1.3          prettyunits_1.2.0       httr_1.4.7             
##  [85] htmlwidgets_1.6.4       S4Arrays_1.2.0          pkgconfig_2.0.3        
##  [88] NISTunits_1.0.1         gtable_0.3.5            blob_1.2.4             
##  [91] ComplexHeatmap_2.18.0   impute_1.76.0           XVector_0.42.0         
##  [94] htmltools_0.5.8.1       carData_3.0-5           rain_1.36.0            
##  [97] MALDIquant_1.22.2       ProtGenerics_1.34.0     clue_0.3-65            
## [100] scales_1.3.0            tmvtnorm_1.6            png_0.1-8              
## [103] knitr_1.47              rstudioapi_0.16.0       reshape2_1.4.4         
## [106] tzdb_0.4.0              rjson_0.2.21            curl_5.2.0             
## [109] cachem_1.0.8            zoo_1.8-12              GlobalOptions_0.1.2    
## [112] parallel_4.3.2          AnnotationDbi_1.64.1    mzID_1.40.0            
## [115] vsn_3.70.0              pillar_1.9.0            grid_4.3.2             
## [118] vctrs_0.6.5             pcaMethods_1.94.0       promises_1.2.1         
## [121] randomForest_4.7-1.1    car_3.1-2               dbplyr_2.5.0           
## [124] xtable_1.8-4            cluster_2.1.6           evaluate_0.24.0        
## [127] mvtnorm_1.2-4           cli_3.6.2               locfit_1.5-9.8         
## [130] compiler_4.3.2          rlang_1.1.3             crayon_1.5.3           
## [133] rngtools_1.5.2          ggsignif_0.6.4          labeling_0.4.3         
## [136] affy_1.80.0             plyr_1.8.9              stringi_1.8.3          
## [139] assertthat_0.2.1        munsell_0.5.1           Biostrings_2.70.1      
## [142] Matrix_1.6-5            hms_1.1.3               bit64_4.0.5            
## [145] KEGGREST_1.42.0         statmod_1.5.0           shiny_1.8.1.1          
## [148] highr_0.11              mzR_2.36.0              broom_1.0.6            
## [151] memoise_2.0.1           affyio_1.72.0           bslib_0.7.0            
## [154] bit_4.0.5